Objective

  • Understand crime in WDC by exploring data at both macro and micro levels, and then make predictions.

Goals

  1. Find total offenses by each factor group
    • Historically
    • Latest Year 2017
  2. Find total offenses by each offense type/ward group
    • Historically
    • Latest Year 2017
  3. Trend over time (last 3 years)
    • total offenses
    • total offenses by type
  4. Predict total offenses for the next 4 weeks
  5. Create Geographic Maps
    • by offense type
      • by last 3 available years

Get Data

a = read_csv('raw.csv') %>% 
  clean_names() %>% 
  select(sort(tidyselect::peek_vars())) %>% 
  select(
    report_dat,
    month, day, year, hour, minute, second,
    where(is.Date),
    where(is.character),
    where(is.factor),
    where(is.numeric)
  ) %>% slice_sample(prop = 1) #work with 5% of dataset for now

Notes

  1. rmarkdown variable for filtering entire doc to specific year

Resources

  1. DC Wards

sample data

a %>% sample_n(10)

glimpse structure

a %>% glimpse
## Rows: 342,867
## Columns: 30
## $ report_dat           <chr> "4/26/2008 11:10:00 AM", "2/12/2009 11:04:00 P...
## $ month                <dbl> 4, 2, 3, 7, 6, 3, 12, 4, 8, 8, 8, 9, 1, 4, 8, ...
## $ day                  <dbl> 26, 12, 16, 22, 23, 22, 28, 16, 8, 31, 21, 27,...
## $ year                 <dbl> 2008, 2009, 2012, 2009, 2017, 2012, 2012, 2014...
## $ hour                 <dbl> 11, 23, 11, 19, 18, 13, 13, 12, 14, 2, 20, 16,...
## $ minute               <dbl> 10, 4, 30, 30, 59, 12, 39, 23, 0, 40, 44, 13, ...
## $ second               <dbl> 0, 0, 0, 0, 37, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, ...
## $ anc                  <chr> "2C", "1B", "5E", "5A", "1A", "2F", "7C", "2B"...
## $ block                <chr> "700 - 723 BLOCK OF 14TH STREET NW", "2390 - 2...
## $ block_group          <chr> "005800 1", "003400 2", "008702 1", "009501 1"...
## $ crimetype            <chr> "Non-Violent", "Non-Violent", "Non-Violent", "...
## $ end_date             <chr> "4/26/2008 10:41:00 AM", "2/12/2009 12:00:00 A...
## $ ew                   <chr> "East", "East", "East", "East", "East", "East"...
## $ method               <chr> "OTHERS", "OTHERS", "OTHERS", "OTHERS", "OTHER...
## $ neighborhood_cluster <chr> "Cluster 8", "Cluster 3", "Cluster 21", "Clust...
## $ ns                   <chr> "North", "North", "North", "North", "North", "...
## $ offense              <chr> "THEFT/OTHER", "THEFT/OTHER", "THEFT F/AUTO", ...
## $ quad                 <chr> "Northeast", "Northeast", "Northeast", "Northe...
## $ shift                <chr> "DAY", "MIDNIGHT", "DAY", "EVENING", "EVENING"...
## $ start_date           <chr> "4/26/2008 10:40:00 AM", "2/12/2009 4:30:00 PM...
## $ voting_precinct      <chr> "Precinct 129", "Precinct 37", "Precinct 75", ...
## $ ccn                  <dbl> 8055446, 9020445, 12036569, 9103366, 17107226,...
## $ census_tract         <dbl> 5800, 3400, 8702, 9501, 3000, 4902, 7804, 4300...
## $ district             <dbl> 2, 3, 5, 4, 3, 3, 6, 3, 4, 3, 2, 7, 5, 6, 5, 5...
## $ psa                  <dbl> 207, 306, 502, 405, 302, 308, 602, 301, 401, 3...
## $ ward                 <dbl> 2, 1, 5, 5, 1, 2, 7, 2, 4, 1, 2, 8, 5, 8, 5, 5...
## $ x                    <dbl> 60706, 295890, 121333, 114544, 323805, 48697, ...
## $ x1                   <dbl> 60706, 295890, 121333, 114544, 323805, 48697, ...
## $ xblock               <dbl> -77.03196, -77.02129, -76.99953, -77.00746, -7...
## $ yblock               <dbl> 38.89885, 38.92368, 38.91687, 38.94822, 38.929...

counts of unique levels

a %>% map_df(n_unique)

clean data

#remove unused cols
a = a %>% select(!c(minute, second, anc, block, block_group, end_date, ew, neighborhood_cluster, ns, start_date, voting_precinct, ccn, district, x, x1)) %>%
  mutate(
    report_dat = anytime::anydate(report_dat),
    #start_date = anytime::anydate(start_date),
    #end_date = anytime::anydate(end_date),
    across(where(is.character), factor),
    census_tract = factor(census_tract, levels = a$census_tract %>% unique %>% sort),
    ward = factor(ward, levels = a$ward %>% unique %>% sort),
    psa = factor(psa, levels = a$psa %>% unique %>% sort),
    year = factor(year, levels = a$year %>% unique %>% sort),
    month = factor(month, levels = a$month %>% unique %>% sort),
    day = factor(day, levels = a$day %>% unique %>% sort),
    hour = factor(hour, levels = a$hour %>% unique %>% sort)
    ) %>%
  select(sort(tidyselect::peek_vars())) %>% 
  select(
    where(is.Date),
    month, day, year, hour,
    where(is.character),
    where(is.factor),
    where(is.numeric) 
    ) %>% arrange(report_dat, crimetype, offense)
    
abak = a
#saveRDS(abak, 'cleaned_data_10yrs.RDS')
#a = abak

Notes

  1. If we wanted to do more detailed geographic analysis, we might want to include block and block group
  2. psa = police service area

EDA: factor vars

sample data

a %>% select(where(is.factor)) %>% sample_n(10)

glimpse structure

a %>% select(where(is.factor)) %>% glimpse
## Rows: 10,594
## Columns: 12
## $ month        <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ day          <fct> 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, ...
## $ year         <fct> 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, ...
## $ hour         <fct> 3, 13, 8, 21, 16, 20, 20, 7, 11, 3, 8, 13, 8, 21, 17, ...
## $ census_tract <fct> 7809, 9204, 10600, 7603, 3301, 10200, 7803, 9810, 1090...
## $ crimetype    <fct> Non-Violent, Non-Violent, Non-Violent, Non-Violent, No...
## $ method       <fct> OTHERS, OTHERS, OTHERS, OTHERS, OTHERS, OTHERS, OTHERS...
## $ offense      <fct> BURGLARY, MOTOR VEHICLE THEFT, THEFT F/AUTO, MOTOR VEH...
## $ psa          <fct> 608, 502, 501, 606, 501, 105, 602, 708, 708, 403, 107,...
## $ quad         <fct> Northeast, Northeast, Northeast, Southeast, Northeast,...
## $ shift        <fct> MIDNIGHT, DAY, DAY, EVENING, EVENING, EVENING, EVENING...
## $ ward         <fct> 7, 5, 6, 7, 5, 6, 7, 8, 8, 4, 6, 8, 5, 1, 1, 6, 6, 1, ...

check missing values

a %>% select(where(is.factor)) %>% miss_var_summary

With so little data missing, I feel comfortable omitting rows with NAs

a = a %>% na.omit()

distribution of level counts per factor

#defining custom color palette
jpal = grDevices::colorRampPalette(brewer.pal(8,'Dark2'))(25)

a %>% select(where(is.factor)) %>% map_df(n_unique)
a %>% select(where(is.factor)) %>%
  map_df(n_unique) %>%
  pivot_longer(. , everything(), 'features') %>%
  plot_ly(x = ~features, y = ~value, color = ~features, colors = jpal) %>% 
  add_bars() %>%
  layout(title = 'Unique Level Counts per Factor')

reference: names of unique levels

a %>% select(where(is.factor)) %>% map(unique)
## $month
##  [1] 1  2  3  4  5  6  7  8  9  10 11 12
## Levels: 1 2 3 4 5 6 7 8 9 10 11 12
## 
## $day
##  [1] 1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31
## 31 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 31
## 
## $year
## [1] 2012 2013 2014 2015 2016 2017
## Levels: 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
## 
## $hour
##  [1] 3  13 8  21 16 20 7  11 17 14 9  10 4  18 12 15 2  0  19 6  22 1  23 5 
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## 
## $census_tract
##   [1] 7809  9204  10600 7603  3301  10200 7803  9810  10900 2002  8200  7504 
##  [13] 8803  2802  3000  8410  3200  5201  2702  9604  9505  9504  11100 9102 
##  [25] 4002  9700  4802  6600  4400  11000 4600  9803  502   600   8701  9507 
##  [37] 6202  3700  5800  2701  7502  5002  9503  9903  9811  2202  10400 4801 
##  [49] 4100  8804  8904  9603  8903  9203  3100  1702  5600  9509  10100 4902 
##  [61] 7304  1001  7709  202   8100  8402  5500  1901  2900  8302  9601  6801 
##  [73] 2101  9400  9301  7409  7708  4702  3400  6400  7707  400   5001  7401 
##  [85] 4202  10700 2502  6500  1600  2102  4300  7403  7200  9602  100   7404 
##  [97] 3800  3600  9802  8001  9904  5900  7703  2600  3302  8802  7100  9901 
## [109] 2801  300   501   7901  7406  2501  7503  8301  1100  9907  10300 7407 
## [121] 1002  4901  9807  7605  8002  9508  9501  7408  5301  1402  4001  901  
## [133] 3500  8702  7000  2201  10800 2400  7604  1804  10500 9000  9302  1302 
## [145] 7808  1803  9905  1200  7601  9804  2001  9201  7804  4201  9906  9902 
## [157] 9801  4701  7806  702   3900  802   701   7807  6802  6900  2302  6804 
## [169] 2301  7903  201   1401  801   1301  6700  1902  7301  1500  902  
## 179 Levels: 100 201 202 300 400 501 502 600 701 702 801 802 901 902 ... 11100
## 
## $crimetype
## [1] Non-Violent Violent    
## Levels: Non-Violent Violent
## 
## $method
## [1] OTHERS GUN    KNIFE 
## Levels: GUN KNIFE OTHERS
## 
## $offense
## [1] BURGLARY                   MOTOR VEHICLE THEFT       
## [3] THEFT F/AUTO               THEFT/OTHER               
## [5] ASSAULT W/DANGEROUS WEAPON ROBBERY                   
## [7] SEX ABUSE                  ARSON                     
## [9] HOMICIDE                  
## 9 Levels: ARSON ASSAULT W/DANGEROUS WEAPON BURGLARY ... THEFT/OTHER
## 
## $psa
##  [1] 608 502 501 606 105 602 708 403 107 701 506 302 104 409 307 408 603 406 504
## [20] 503 303 706 308 305 707 204 404 102 304 101 702 405 103 505 306 208 507 401
## [39] 207 705 202 605 206 402 601 108 704 703 301 203 106 604 201 407 607 205
## 56 Levels: 101 102 103 104 105 106 107 108 201 202 203 204 205 206 207 ... 708
## 
## $quad
## [1] Northeast Southeast Northwest
## Levels: Northeast Northwest Southeast
## 
## $shift
## [1] MIDNIGHT DAY      EVENING 
## Levels: DAY EVENING MIDNIGHT
## 
## $ward
## [1] 7 5 6 8 4 1 2 3
## Levels: 1 2 3 4 5 6 7 8

Goal 1. Find total offenses by each factor group X

Historically (2012 - 2016)

a %>% filter(year != 2017) %>% select(where(is.factor)) %>% DataExplorer::plot_bar(ncol = 1, nrow = 1, title = 'Total Offenses by Category - Historic')
## 2 columns ignored with more than 50 categories.
## census_tract: 179 categories
## psa: 56 categories

Observations:

  1. Most offenses occurs late summer to early fall (top 3 months: Oct, July, Aug)
  2. More offenses occurs towards the latter half the month (>= Day 16)
  3. More offenses occurs towards the end of the month (top 3 months: July, August, June)
  4. Non-violent offenses occur ~4.5 x as much as violent offenses
  5. Theft is, by far, the most common offense
    • auto-theft is so predominant, it has its own category
  6. The Northeast quad is significantly more dangerous than the other wards
    • OR… we collect data more accurately/frequently for this quad?
  7. Ward 2 is the most dangerous ward by a good margin
    • OR… we collect data more accurately/frequently for this ward?

Latest Year 2017

a %>% filter(year == 2017) %>% select(where(is.factor)) %>% DataExplorer::plot_bar(ncol = 1, nrow = 1, title = 'Total Offenses by Category - 2017')
## 2 columns ignored with more than 50 categories.
## census_tract: 174 categories
## psa: 56 categories

Observations relative to Historic

  1. January 2017 was atypically more dangerous
  2. Non-violent offenses make up a greater percent of total offenses, occurring about ~6 x as much as violent offenses
  3. Wards 5 and 7 are more dangerous than they were historically; 6 is safer

Goal 2. Find total offenses by each offense type/ward group

Historically (2012 - 2016)

ggplotly(
  a %>% filter(year != 2017) %>%
    count(ward, offense) %>% ggplot(aes(x = offense, y = n, fill = offense)) +
    geom_col() +
    coord_flip() +
    labs(x = '', y = 'count', title = 'Total Offenses by Type/Ward - Historic') +
    facet_wrap(~ward) +
    theme(legend.position = 'none')
)

Observations:

  1. Ward 2 has the greatest number of cases as observed above, but this is disproportionately mostly composed of theft/other offenses
    • the offense count of its second largest offense category, theft/auto, is even slightly less than that of Ward 1’s’
  2. Ward 7 has a relatively high proportion of motor vehicle theft offenses
  3. Ward 8 has a relatively high proportion of burglary offenses

Latest Year 2017

ggplotly(
  a %>% filter(year == 2017) %>% 
    count(ward, offense) %>% ggplot(aes(x = offense, y = n, fill = offense)) +
    geom_col() +
    coord_flip() +
    labs(x = '', y = 'count', title = 'Total Offenses by Type/Ward - 2017') +
    facet_wrap(~ward) +
    theme(legend.position = 'none')
)

Observations relative to Historic

  1. Ward 2’s share of theft f/auto is higher
  2. Relative to other wards, Wards 5,6,7 are more dangerous than they were historically
    • on an absolute basis, all wards are less dangerous
      • see time series graphs below

Goal 3. Trend over time

# create col for start of month (a 'month' col) used for grouping and graphing
a = a %>% mutate(
  monthkey = lubridate::make_datetime(
    as.numeric(as.character(year)),
    as.numeric(as.character(month)),
    1)
  ) %>% relocate(report_dat, monthkey, everything())

#Total Offenses over time

Total Offenses by Month

a %>% group_by(monthkey) %>%
  summarise(total.offenses = n()) %>%
  ungroup() %>% 
  plot_ly(x = ~monthkey, y =~total.offenses) %>%
  add_lines(size = I(3)) %>% layout(
  xaxis = list(title = ''),
  yaxis = list(title = ''),
  title = 'Total Offenses by Month'
  )

Observations

  1. Crime follows clear seasonality
    • peaks occurring late summer/early fall
  2. Peaked in mid 2014
  3. Troughed in early 2015

Total Offenses by Month/Type

ggplotly(
  a %>% group_by(monthkey, offense)%>%
    summarise(total.offenses = n()) %>%
    ungroup() %>%
    mutate(offense = fct_reorder(offense, total.offenses, .fun = mean)) %>% 
    ggplot(aes(monthkey, total.offenses, fill = offense)) +
    geom_area() +
    labs(title = 'Total Offenses Percentage (#) by Month/Offense Type', x = '', y = '')
)
ggplotly(
  a %>% group_by(monthkey, offense)%>%
    summarise(total.offenses = n()) %>%
    mutate(total.offenses.pct = total.offenses/sum(total.offenses)) %>% 
    ungroup() %>% 
    mutate(offense = fct_reorder(offense, total.offenses, .fun = mean)) %>% 
    ggplot(aes(monthkey, total.offenses.pct, fill = offense)) +
    geom_area() +
    scale_y_continuous(labels = scales::percent) +
    labs(title = 'Total Offenses Percentage (%) by Month/Offense Type', x = '', y = '')
)

Total Offenses by Month/Ward

ggplotly(
  a %>% group_by(monthkey, ward)%>%
    summarise(total.offenses = n()) %>%
    ungroup() %>%
    mutate(ward = fct_reorder(ward, total.offenses, .fun = mean)) %>% 
    ggplot(aes(monthkey, total.offenses, fill = ward)) +
    geom_area() +
    labs(title = 'Total Offenses Percentage (#) by Month/Ward', x = '', y = '')
)
ggplotly(
  a %>% group_by(monthkey, ward)%>%
    summarise(total.offenses = n()) %>%
    mutate(total.offenses.pct = total.offenses/sum(total.offenses)) %>% 
    ungroup() %>% 
    mutate(ward = fct_reorder(ward, total.offenses, .fun = mean)) %>% 
    ggplot(aes(monthkey, total.offenses.pct, fill = ward)) +
    geom_area() +
    scale_y_continuous(labels = scales::percent) +
    labs(title = 'Total Offenses Percentage (%) by Month/Ward', x = '', y = '')
) 

EDA: num vars

sample data

a %>% select(where(is.numeric)) %>% sample_n(10)

glimpse structure

a %>% select(where(is.numeric)) %>% glimpse
## Rows: 10,565
## Columns: 2
## $ xblock <dbl> -76.92713, -76.99716, -77.00740, -76.94992, -77.01060, -77.0...
## $ yblock <dbl> 38.90078, 38.92093, 38.90645, 38.86128, 38.92023, 38.87786, ...

check missing values

a %>% select(where(is.numeric)) %>% miss_var_summary

quick map

a %>% plot_ly(x = ~xblock, y = ~yblock, color = ~ward) %>% add_markers() %>% layout(title = 'WDC Wards')
a %>% plot_ly(x = ~xblock, y = ~yblock, color = ~census_tract) %>% add_markers() %>% layout(title = 'WDC Wards Census Tracts') %>% hide_legend()
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

Anomaly Detection

Total Offenses by Month

library(anomalize)
## == Use anomalize to improve your Forecasts by 50%! ==============================
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
# anomalize(data, target, method = c("iqr", "gesd"), alpha = 0.05, max_anoms = 0.2, verbose = FALSE)

# alpha: Controls the width of the "normal" range. Lower values are more conservative while higher values are less prone to incorrectly classifying "normal" observations.

# max_anoms: The maximum percent of anomalies permitted to be identified.

a.anomalize = a %>%
  group_by(monthkey) %>% 
  summarise(total.offenses = n()) %>%
  time_decompose(total.offenses, method = 'twitter', merge = TRUE) %>%
  anomalize(remainder, alpha = 0.30, method = 'gesd') %>%
  time_recompose()
## `summarise()` ungrouping output (override with `.groups` argument)
## Converting from tbl_df to tbl_time.
## Auto-index message: index = monthkey
## frequency = 12 months
## median_span = 35.5 months
ggplotly(
a.anomalize %>%
  filter(monthkey < as.Date('2017-11-01')) %>% 
  plot_anomalies(
    ncol = 1,
    alpha_dots = 0.5,
    alpha_circles = 0.5,
    size_circles = 1.5,
    time_recomposed = TRUE,
    alpha_ribbon = 0.05
    ) + scale_y_continuous(labels = comma) +
  labs(x = '', y = 'total.offenses', title = 'total.offenses')
)

There was statistically high crime activity in November 2014 There was statistically low crime activity in August & September 2017

Predict Total Offenses for the next 4 Weeks

a %>% count(monthkey, crimetype, name = 'count') %>%
  select(-monthkey) %>%
  plot_ly(y = ~crimetype, x = ~count, color = ~crimetype) %>%
  add_boxplot() %>% 
  layout(
    title = 'Daily Crime Cases by Crime Type ',
    xaxis = list(title ='Roughly 5 Year Distribution'),
    yaxis = list(title ='')
    )
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

Create Model

library(prophet)

#renaming cols to prophet's col conventions
a.agg = a %>% filter(crimetype == 'Violent') %>% 
  group_by(report_dat = round_date(report_dat,'day')) %>% 
  summarise(total.offenses = n()) %>% 
  select(ds = report_dat, y = total.offenses)

#creating model
a.agg.model = a.agg %>% prophet()

#using model make future period df
a.agg.future = a.agg.model %>% make_future_dataframe(
  periods = 28, #next 4 wks
  freq = 'day') 

#make forecasts df
a.agg.forecast = a.agg.model %>% predict(a.agg.future)

#plot forecast
a.agg.model %>% dyplot.prophet(a.agg.forecast)
## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#plot forecast components
a.agg.model %>% prophet_plot_components(a.agg.forecast)

Mondays are more dangerous than other days of the week

Leaflet

Preprocessing

Simple Modeling

Complex Modeling

Auto Modeling